In [1372]:
#Loading the libraries
!pip install matplotlib.backends.backend_pdf
import pandas as pd     #For general purpose Data Manipulation
import numpy as np  #For some operations - creating vectors and matrices/log transformation
from matplotlib.backends.backend_pdf import PdfPages #Plotting Box-Whisker
from matplotlib import pyplot as plt #Plotting Box-Whisker --> Outlier Detection
import seaborn as sns  #Plotting Box-Whisker and others
from sklearn.model_selection import train_test_split  #For Building Train and Test Set
from sklearn.linear_model import LinearRegression    #For Regression Analysis
from sklearn.preprocessing import LabelEncoder #for the columns who will suffer from curse of dimensionality
from sklearn.preprocessing import OneHotEncoder #with less number of columns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
Collecting matplotlib.backends.backend_pdf
  Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.VerifiedHTTPSConnection object at 0x00000137527E99B0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/matplotlib-backends-backend-pdf/
  Retrying (Retry(total=3, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.VerifiedHTTPSConnection object at 0x00000137527E97F0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/matplotlib-backends-backend-pdf/
  Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.VerifiedHTTPSConnection object at 0x00000137527E9470>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/matplotlib-backends-backend-pdf/
  Retrying (Retry(total=1, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.VerifiedHTTPSConnection object at 0x00000137527E9550>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/matplotlib-backends-backend-pdf/
  Retrying (Retry(total=0, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.VerifiedHTTPSConnection object at 0x00000137527E91D0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')': /simple/matplotlib-backends-backend-pdf/
  Could not find a version that satisfies the requirement matplotlib.backends.backend_pdf (from versions: )
No matching distribution found for matplotlib.backends.backend_pdf
In [566]:
train=pd.read_csv("D:/Assignment1_PGD B7/Train.csv")
train.head()
Out[566]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 FDA15 9.30 Low Fat 0.016047 Dairy 249.8092 OUT049 1999 Medium Tier 1 Supermarket Type1 3735.1380
1 DRC01 5.92 Regular 0.019278 Soft Drinks 48.2692 OUT018 2009 Medium Tier 3 Supermarket Type2 443.4228
2 FDN15 17.50 Low Fat 0.016760 Meat 141.6180 OUT049 1999 Medium Tier 1 Supermarket Type1 2097.2700
3 FDX07 19.20 Regular 0.000000 Fruits and Vegetables 182.0950 OUT010 1998 NaN Tier 3 Grocery Store 732.3800
4 NCD19 8.93 Low Fat 0.000000 Household 53.8614 OUT013 1987 High Tier 3 Supermarket Type1 994.7052
In [729]:
X=train.iloc[:,:-1]
Y=train["Item_Outlet_Sales"]
In [697]:
train.head()
Out[697]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 FDA15 9.30 Low Fat 0.016047 Dairy 249.8092 OUT049 1999.0 Medium Tier 1 Supermarket Type1 3735.1380
1 DRC01 5.92 Regular 0.019278 Soft Drinks 48.2692 OUT018 2009.0 Medium Tier 3 Supermarket Type2 443.4228
2 FDN15 17.50 Low Fat 0.016760 Meat 141.6180 OUT049 1999.0 Medium Tier 1 Supermarket Type1 2097.2700
3 FDX07 19.20 Regular 0.000000 Fruits and Vegetables 182.0950 OUT010 1998.0 NaN Tier 3 Grocery Store 732.3800
4 NCD19 8.93 Low Fat 0.000000 Household 53.8614 OUT013 1987.0 High Tier 3 Supermarket Type1 994.7052
In [764]:
Y.head()
Y.shape
Out[764]:
(8523,)
In [699]:
#Calculating the shape of original train data set
train.shape
Out[699]:
(8523, 12)
In [684]:
#Calculating the shape of X 
X.shape
Out[684]:
(8523, 11)
In [490]:
#Calulating the shape of Y
Y.shape
Out[490]:
(8523,)
In [491]:
!pip install pandas_profiling
Requirement already satisfied: pandas_profiling in c:\programdata\anaconda3\lib\site-packages (2.3.0)
Requirement already satisfied: pandas>=0.19 in c:\programdata\anaconda3\lib\site-packages (from pandas_profiling) (0.24.2)
Requirement already satisfied: htmlmin>=0.1.12 in c:\programdata\anaconda3\lib\site-packages (from pandas_profiling) (0.1.12)
Requirement already satisfied: confuse>=1.0.0 in c:\programdata\anaconda3\lib\site-packages (from pandas_profiling) (1.0.0)
Requirement already satisfied: astropy in c:\programdata\anaconda3\lib\site-packages (from pandas_profiling) (3.1.2)
Requirement already satisfied: matplotlib>=1.4 in c:\programdata\anaconda3\lib\site-packages (from pandas_profiling) (3.0.3)
Requirement already satisfied: missingno>=0.4.2 in c:\programdata\anaconda3\lib\site-packages (from pandas_profiling) (0.4.2)
Requirement already satisfied: phik>=0.9.8 in c:\programdata\anaconda3\lib\site-packages (from pandas_profiling) (0.9.8)
Requirement already satisfied: jinja2>=2.8 in c:\programdata\anaconda3\lib\site-packages (from pandas_profiling) (2.10)
Requirement already satisfied: pytz>=2011k in c:\programdata\anaconda3\lib\site-packages (from pandas>=0.19->pandas_profiling) (2018.9)
Requirement already satisfied: numpy>=1.12.0 in c:\programdata\anaconda3\lib\site-packages (from pandas>=0.19->pandas_profiling) (1.16.2)
Requirement already satisfied: python-dateutil>=2.5.0 in c:\programdata\anaconda3\lib\site-packages (from pandas>=0.19->pandas_profiling) (2.8.0)
Requirement already satisfied: pyyaml in c:\programdata\anaconda3\lib\site-packages (from confuse>=1.0.0->pandas_profiling) (5.1)
Requirement already satisfied: cycler>=0.10 in c:\programdata\anaconda3\lib\site-packages (from matplotlib>=1.4->pandas_profiling) (0.10.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib>=1.4->pandas_profiling) (1.0.1)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib>=1.4->pandas_profiling) (2.3.1)
Requirement already satisfied: seaborn in c:\programdata\anaconda3\lib\site-packages (from missingno>=0.4.2->pandas_profiling) (0.9.0)
Requirement already satisfied: scipy in c:\programdata\anaconda3\lib\site-packages (from missingno>=0.4.2->pandas_profiling) (1.2.1)
Requirement already satisfied: jupyter-client>=5.2.3 in c:\programdata\anaconda3\lib\site-packages (from phik>=0.9.8->pandas_profiling) (5.2.4)
Requirement already satisfied: pytest-pylint>=0.13.0 in c:\programdata\anaconda3\lib\site-packages (from phik>=0.9.8->pandas_profiling) (0.14.1)
Requirement already satisfied: numba>=0.38.1 in c:\programdata\anaconda3\lib\site-packages (from phik>=0.9.8->pandas_profiling) (0.43.1)
Requirement already satisfied: pytest>=4.0.2 in c:\programdata\anaconda3\lib\site-packages (from phik>=0.9.8->pandas_profiling) (4.3.1)
Requirement already satisfied: nbconvert>=5.3.1 in c:\programdata\anaconda3\lib\site-packages (from phik>=0.9.8->pandas_profiling) (5.4.1)
Requirement already satisfied: MarkupSafe>=0.23 in c:\programdata\anaconda3\lib\site-packages (from jinja2>=2.8->pandas_profiling) (1.1.1)
Requirement already satisfied: six>=1.5 in c:\programdata\anaconda3\lib\site-packages (from python-dateutil>=2.5.0->pandas>=0.19->pandas_profiling) (1.12.0)
Requirement already satisfied: setuptools in c:\programdata\anaconda3\lib\site-packages (from kiwisolver>=1.0.1->matplotlib>=1.4->pandas_profiling) (40.8.0)
Requirement already satisfied: traitlets in c:\programdata\anaconda3\lib\site-packages (from jupyter-client>=5.2.3->phik>=0.9.8->pandas_profiling) (4.3.2)
Requirement already satisfied: tornado>=4.1 in c:\programdata\anaconda3\lib\site-packages (from jupyter-client>=5.2.3->phik>=0.9.8->pandas_profiling) (6.0.2)
Requirement already satisfied: pyzmq>=13 in c:\programdata\anaconda3\lib\site-packages (from jupyter-client>=5.2.3->phik>=0.9.8->pandas_profiling) (18.0.0)
Requirement already satisfied: jupyter-core in c:\programdata\anaconda3\lib\site-packages (from jupyter-client>=5.2.3->phik>=0.9.8->pandas_profiling) (4.4.0)
Requirement already satisfied: pylint>=1.4.5 in c:\programdata\anaconda3\lib\site-packages (from pytest-pylint>=0.13.0->phik>=0.9.8->pandas_profiling) (2.3.1)
Requirement already satisfied: llvmlite>=0.28.0dev0 in c:\programdata\anaconda3\lib\site-packages (from numba>=0.38.1->phik>=0.9.8->pandas_profiling) (0.28.0)
Requirement already satisfied: py>=1.5.0 in c:\programdata\anaconda3\lib\site-packages (from pytest>=4.0.2->phik>=0.9.8->pandas_profiling) (1.8.0)
Requirement already satisfied: attrs>=17.4.0 in c:\programdata\anaconda3\lib\site-packages (from pytest>=4.0.2->phik>=0.9.8->pandas_profiling) (19.1.0)
Requirement already satisfied: atomicwrites>=1.0 in c:\programdata\anaconda3\lib\site-packages (from pytest>=4.0.2->phik>=0.9.8->pandas_profiling) (1.3.0)
Requirement already satisfied: pluggy>=0.7 in c:\programdata\anaconda3\lib\site-packages (from pytest>=4.0.2->phik>=0.9.8->pandas_profiling) (0.9.0)
Requirement already satisfied: more-itertools>=4.0.0 in c:\programdata\anaconda3\lib\site-packages (from pytest>=4.0.2->phik>=0.9.8->pandas_profiling) (6.0.0)
Requirement already satisfied: colorama in c:\programdata\anaconda3\lib\site-packages (from pytest>=4.0.2->phik>=0.9.8->pandas_profiling) (0.4.1)
Requirement already satisfied: mistune>=0.8.1 in c:\programdata\anaconda3\lib\site-packages (from nbconvert>=5.3.1->phik>=0.9.8->pandas_profiling) (0.8.4)
Requirement already satisfied: pygments in c:\programdata\anaconda3\lib\site-packages (from nbconvert>=5.3.1->phik>=0.9.8->pandas_profiling) (2.3.1)
Requirement already satisfied: nbformat>=4.4 in c:\programdata\anaconda3\lib\site-packages (from nbconvert>=5.3.1->phik>=0.9.8->pandas_profiling) (4.4.0)
Requirement already satisfied: entrypoints>=0.2.2 in c:\programdata\anaconda3\lib\site-packages (from nbconvert>=5.3.1->phik>=0.9.8->pandas_profiling) (0.3)
Requirement already satisfied: bleach in c:\programdata\anaconda3\lib\site-packages (from nbconvert>=5.3.1->phik>=0.9.8->pandas_profiling) (3.1.0)
Requirement already satisfied: pandocfilters>=1.4.1 in c:\programdata\anaconda3\lib\site-packages (from nbconvert>=5.3.1->phik>=0.9.8->pandas_profiling) (1.4.2)
Requirement already satisfied: testpath in c:\programdata\anaconda3\lib\site-packages (from nbconvert>=5.3.1->phik>=0.9.8->pandas_profiling) (0.4.2)
Requirement already satisfied: defusedxml in c:\programdata\anaconda3\lib\site-packages (from nbconvert>=5.3.1->phik>=0.9.8->pandas_profiling) (0.5.0)
Requirement already satisfied: ipython-genutils in c:\programdata\anaconda3\lib\site-packages (from traitlets->jupyter-client>=5.2.3->phik>=0.9.8->pandas_profiling) (0.2.0)
Requirement already satisfied: decorator in c:\programdata\anaconda3\lib\site-packages (from traitlets->jupyter-client>=5.2.3->phik>=0.9.8->pandas_profiling) (4.4.0)
Requirement already satisfied: astroid<3,>=2.2.0 in c:\programdata\anaconda3\lib\site-packages (from pylint>=1.4.5->pytest-pylint>=0.13.0->phik>=0.9.8->pandas_profiling) (2.2.5)
Requirement already satisfied: isort<5,>=4.2.5 in c:\programdata\anaconda3\lib\site-packages (from pylint>=1.4.5->pytest-pylint>=0.13.0->phik>=0.9.8->pandas_profiling) (4.3.16)
Requirement already satisfied: mccabe<0.7,>=0.6 in c:\programdata\anaconda3\lib\site-packages (from pylint>=1.4.5->pytest-pylint>=0.13.0->phik>=0.9.8->pandas_profiling) (0.6.1)
Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in c:\programdata\anaconda3\lib\site-packages (from nbformat>=4.4->nbconvert>=5.3.1->phik>=0.9.8->pandas_profiling) (3.0.1)
Requirement already satisfied: webencodings in c:\programdata\anaconda3\lib\site-packages (from bleach->nbconvert>=5.3.1->phik>=0.9.8->pandas_profiling) (0.5.1)
Requirement already satisfied: lazy-object-proxy in c:\programdata\anaconda3\lib\site-packages (from astroid<3,>=2.2.0->pylint>=1.4.5->pytest-pylint>=0.13.0->phik>=0.9.8->pandas_profiling) (1.3.1)
Requirement already satisfied: wrapt in c:\programdata\anaconda3\lib\site-packages (from astroid<3,>=2.2.0->pylint>=1.4.5->pytest-pylint>=0.13.0->phik>=0.9.8->pandas_profiling) (1.11.1)
Requirement already satisfied: typed-ast>=1.3.0; implementation_name == "cpython" in c:\programdata\anaconda3\lib\site-packages (from astroid<3,>=2.2.0->pylint>=1.4.5->pytest-pylint>=0.13.0->phik>=0.9.8->pandas_profiling) (1.4.0)
Requirement already satisfied: pyrsistent>=0.14.0 in c:\programdata\anaconda3\lib\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert>=5.3.1->phik>=0.9.8->pandas_profiling) (0.14.11)
In [700]:
# Creating a new dataset with only numeric values
numeric_var_names=[key for key in dict(train.dtypes) if dict(train.dtypes)[key] in ['float64', 'int64', 'float32', 'int32']]
cat_var_names=[key for key in dict(train.dtypes) if dict(train.dtypes)[key] in ['object', 'O']]
print(numeric_var_names)
print(cat_var_names)
['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year', 'Item_Outlet_Sales']
['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
In [701]:
train_num=train[numeric_var_names]
train_num.head(5)
Out[701]:
Item_Weight Item_Visibility Item_MRP Outlet_Establishment_Year Item_Outlet_Sales
0 9.30 0.016047 249.8092 1999.0 3735.1380
1 5.92 0.019278 48.2692 2009.0 443.4228
2 17.50 0.016760 141.6180 1999.0 2097.2700
3 19.20 0.000000 182.0950 1998.0 732.3800
4 8.93 0.000000 53.8614 1987.0 994.7052
In [702]:
train_cat=train[cat_var_names]
train_cat.head(5)
Out[702]:
Item_Identifier Item_Fat_Content Item_Type Outlet_Identifier Outlet_Size Outlet_Location_Type Outlet_Type
0 FDA15 Low Fat Dairy OUT049 Medium Tier 1 Supermarket Type1
1 DRC01 Regular Soft Drinks OUT018 Medium Tier 3 Supermarket Type2
2 FDN15 Low Fat Meat OUT049 Medium Tier 1 Supermarket Type1
3 FDX07 Regular Fruits and Vegetables OUT010 NaN Tier 3 Grocery Store
4 NCD19 Low Fat Household OUT013 High Tier 3 Supermarket Type1
In [703]:
# Getting Quick Summary
def quick_summary(x):
    return pd.Series([x.count(), x.isnull().sum(), x.value_counts()], 
                  index=['N', 'NMISS', 'ColumnsNames'])

num_summary=train_num.apply(lambda x: quick_summary(x))
num_summary
Out[703]:
Item_Weight Item_Visibility Item_MRP Outlet_Establishment_Year Item_Outlet_Sales
N 7060 8523 8523 8523 8523
NMISS 1463 0 0 0 0
ColumnsNames 12.150 86 17.600 82 13.650 77 11.800 ... 0.000000 526 0.195979 144 0.076975 ... 172.0422 7 188.1872 6 170.5422 6 109.... 1985.0 1463 1987.0 932 2004.0 930 1... 6501.8699 186 958.7520 17 1342.2528 ...
In [704]:
train_num.describe()
Out[704]:
Item_Weight Item_Visibility Item_MRP Outlet_Establishment_Year Item_Outlet_Sales
count 7060.000000 8523.000000 8523.000000 8523.000000 8523.000000
mean 12.857645 0.065307 140.992782 1997.831867 2156.313016
std 4.643456 0.048841 62.275067 8.371760 1624.863069
min 4.555000 0.000000 31.290000 1985.000000 33.290000
25% 8.773750 0.026989 93.826500 1987.000000 834.247400
50% 12.600000 0.053931 143.012800 1999.000000 1794.331000
75% 16.850000 0.094585 185.643700 2004.000000 3101.296400
max 21.350000 0.195979 266.888400 2009.000000 6501.869900
In [705]:
bp = PdfPages('x.pdf')
for num_variable in numeric_var_names:
    fig,axes = plt.subplots(figsize=(10,4))
    sns.boxplot( x=num_variable, data = train_num)
    plt.title(str('Box Plot of ') + str(num_variable))
    bp.savefig(fig)
bp.close()

Treatment of Skewness

In [706]:
train.skew()
Out[706]:
Item_Weight                  0.082426
Item_Visibility              0.813980
Item_MRP                     0.127202
Outlet_Establishment_Year   -0.396641
Item_Outlet_Sales            0.874606
dtype: float64
In [1423]:
"""This skewness of data needs to be removed by applying transformations"""
import numpy as np
train['Item_Weight'] = np.log(train['Item_Weight'])
train['Item_Weight'] = np.sqrt(train['Item_Weight'])
train['Item_Visibility'] = np.sqrt(train['Item_Visibility'])
train['Item_MRP'] = np.sqrt(train['Item_MRP'])
train['Outlet_Establishment_Year'] = np.square(train['Outlet_Establishment_Year'])
train['Item_Outlet_Sales'] = np.sqrt(train['Item_Outlet_Sales'])
train.skew()
train.shape
Out[1423]:
(8523, 12)
In [ ]:
"""Skewness is removed after treatment of skewness. Skewness can cause outliers thus it is important that outliers must get 
removed"""

Treatment of Outliers

In [736]:
def outlier_capping(x):
    ul = x.quantile(0.75)+1.5*(x.quantile(0.75)-x.quantile(0.25))
    ll = x.quantile(0.25)-1.5*(x.quantile(0.75)-x.quantile(0.25))
    x = x.clip_upper(ul)
    x = x.clip_lower(ll)
    return x

train_num=train_num.apply(lambda x: outlier_capping(x))
train_num.describe()
train.update(train_num)
train.head()
"""Outliers in X and Y both have been removed."""
Out[736]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 FDA15 9.30 Low Fat 0.016047 Dairy 249.8092 OUT049 1999.0 Medium Tier 1 Supermarket Type1 3735.1380
1 DRC01 5.92 Regular 0.019278 Soft Drinks 48.2692 OUT018 2009.0 Medium Tier 3 Supermarket Type2 443.4228
2 FDN15 17.50 Low Fat 0.016760 Meat 141.6180 OUT049 1999.0 Medium Tier 1 Supermarket Type1 2097.2700
3 FDX07 19.20 Regular 0.000000 Fruits and Vegetables 182.0950 OUT010 1998.0 Medium Tier 3 Grocery Store 732.3800
4 NCD19 8.93 Low Fat 0.000000 Household 53.8614 OUT013 1987.0 High Tier 3 Supermarket Type1 994.7052

Plotting of Boxplot i.e. Categorical Data Vs Categorical Data

In [739]:
x = train[["Item_Weight","Item_Visibility"]].values
plt.boxplot(x)
plt.show()
In [740]:
y=train[["Item_MRP","Outlet_Establishment_Year","Item_Outlet_Sales"]].values
plt.boxplot(y)
plt.show()
In [504]:
import pandas_profiling as pd_prof
pd_prof.ProfileReport(train)
Out[504]:

In [711]:
pd_prof.ProfileReport(train_cat)
Out[711]:

In [507]:
pd_prof.ProfileReport(train_num)
Out[507]:

In [741]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
Item_Identifier              8523 non-null object
Item_Weight                  8523 non-null float64
Item_Fat_Content             8523 non-null object
Item_Visibility              8523 non-null float64
Item_Type                    8523 non-null object
Item_MRP                     8523 non-null float64
Outlet_Identifier            8523 non-null object
Outlet_Establishment_Year    8523 non-null float64
Outlet_Size                  8523 non-null object
Outlet_Location_Type         8523 non-null object
Outlet_Type                  8523 non-null object
Item_Outlet_Sales            8523 non-null float64
dtypes: float64(5), object(7)
memory usage: 799.1+ KB
In [742]:
X.update(train)
X.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 11 columns):
Item_Identifier              8523 non-null object
Item_Weight                  8523 non-null float64
Item_Fat_Content             8523 non-null object
Item_Visibility              8523 non-null float64
Item_Type                    8523 non-null object
Item_MRP                     8523 non-null float64
Outlet_Identifier            8523 non-null object
Outlet_Establishment_Year    8523 non-null float64
Outlet_Size                  8523 non-null object
Outlet_Location_Type         8523 non-null object
Outlet_Type                  8523 non-null object
dtypes: float64(4), object(7)
memory usage: 732.5+ KB
In [743]:
train.describe()
Out[743]:
Item_Weight Item_Visibility Item_MRP Outlet_Establishment_Year Item_Outlet_Sales
count 8523.000000 8523.000000 8523.000000 8523.000000 8523.000000
mean 12.857645 0.065307 140.992782 1997.831867 2156.313016
std 4.226124 0.048841 62.275067 8.371760 1624.863069
min 4.555000 0.000000 31.290000 1985.000000 33.290000
25% 9.310000 0.026989 93.826500 1987.000000 834.247400
50% 12.857645 0.053931 143.012800 1999.000000 1794.331000
75% 16.000000 0.094585 185.643700 2004.000000 3101.296400
max 21.350000 0.195979 266.888400 2009.000000 6501.869900
In [744]:
'''missing value imputation of numerical column "Item_Weight by mean"'''
mean=np.mean(train["Item_Weight"])
train["Item_Weight"].fillna(mean,inplace=True)
train.head()
Out[744]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 FDA15 9.30 Low Fat 0.016047 Dairy 249.8092 OUT049 1999.0 Medium Tier 1 Supermarket Type1 3735.1380
1 DRC01 5.92 Regular 0.019278 Soft Drinks 48.2692 OUT018 2009.0 Medium Tier 3 Supermarket Type2 443.4228
2 FDN15 17.50 Low Fat 0.016760 Meat 141.6180 OUT049 1999.0 Medium Tier 1 Supermarket Type1 2097.2700
3 FDX07 19.20 Regular 0.000000 Fruits and Vegetables 182.0950 OUT010 1998.0 Medium Tier 3 Grocery Store 732.3800
4 NCD19 8.93 Low Fat 0.000000 Household 53.8614 OUT013 1987.0 High Tier 3 Supermarket Type1 994.7052
In [1143]:
'''missing value imputation of categorical column "Item_Size" by mode'''
train["Outlet_Size"].mode()
train["Outlet_Size"].fillna("Medium",inplace=True)
train["Outlet_Size"].head()
train.head()
Out[1143]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 FDA15 9.300 Low Fat 0.016 Dairy 249.809 OUT049 1999.000 Medium Tier 1 Supermarket Type1 3735.138
1 DRC01 5.920 Regular 0.019 Soft Drinks 48.269 OUT018 2009.000 Medium Tier 3 Supermarket Type2 443.423
2 FDN15 17.500 Low Fat 0.017 Meat 141.618 OUT049 1999.000 Medium Tier 1 Supermarket Type1 2097.270
3 FDX07 19.200 Regular 0.000 Fruits and Vegetables 182.095 OUT010 1998.000 Medium Tier 3 Grocery Store 732.380
4 NCD19 8.930 Low Fat 0.000 Household 53.861 OUT013 1987.000 High Tier 3 Supermarket Type1 994.705
In [1514]:
X=train.iloc[:,:-1]
Y=train["Item_Outlet_Sales"]
X.drop(["Item_Identifier"],axis=1)

    
Out[1514]:
Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type
0 1.493 Low Fat 0.127 Dairy 15.805 OUT049 3996001.000 Medium Tier 1 Supermarket Type1
1 1.334 Regular 0.139 Soft Drinks 6.948 OUT018 4036081.000 Medium Tier 3 Supermarket Type2
2 1.692 Low Fat 0.129 Meat 11.900 OUT049 3996001.000 Medium Tier 1 Supermarket Type1
3 1.719 Regular 0.000 Fruits and Vegetables 13.494 OUT010 3992004.000 Medium Tier 3 Grocery Store
4 1.480 Low Fat 0.000 Household 7.339 OUT013 3948169.000 High Tier 3 Supermarket Type1
5 1.530 Regular 0.000 Baking Goods 7.169 OUT018 4036081.000 Medium Tier 3 Supermarket Type2
6 1.617 Regular 0.113 Snack Foods 7.593 OUT013 3948169.000 High Tier 3 Supermarket Type1
7 1.598 Low Fat 0.357 Snack Foods 10.381 OUT027 3940225.000 Medium Tier 3 Supermarket Type3
8 1.669 Regular 0.129 Frozen Foods 9.847 OUT045 4008004.000 Medium Tier 2 Supermarket Type1
9 1.719 Regular 0.307 Frozen Foods 13.705 OUT017 4028049.000 Medium Tier 2 Supermarket Type1
10 1.571 Low Fat 0.000 Fruits and Vegetables 6.748 OUT049 3996001.000 Medium Tier 1 Supermarket Type1
11 1.708 Regular 0.213 Dairy 12.005 OUT046 3988009.000 Small Tier 1 Supermarket Type1
12 1.648 Regular 0.316 Fruits and Vegetables 12.061 OUT049 3996001.000 Medium Tier 1 Supermarket Type1
13 1.693 Regular 0.217 Snack Foods 10.940 OUT046 3988009.000 Small Tier 1 Supermarket Type1
14 1.672 Low Fat 0.261 Fruits and Vegetables 14.016 OUT013 3948169.000 High Tier 3 Supermarket Type1
15 1.482 Regular 0.263 Breakfast 7.507 OUT046 3988009.000 Small Tier 1 Supermarket Type1
16 1.571 Low Fat 0.093 Health and Hygiene 10.740 OUT018 4036081.000 Medium Tier 3 Supermarket Type2
17 1.482 Regular 0.263 Breakfast 7.373 OUT049 3996001.000 Medium Tier 1 Supermarket Type1
18 1.598 Low Fat 0.185 Hard Drinks 10.643 OUT027 3940225.000 Medium Tier 3 Supermarket Type3
19 1.610 Low Fat 0.320 Dairy 15.183 OUT035 4016016.000 Small Tier 2 Supermarket Type1
20 1.714 Regular 0.372 Snack Foods 15.839 OUT013 3948169.000 High Tier 3 Supermarket Type1
21 1.598 Regular 0.188 Baking Goods 12.023 OUT027 3940225.000 Medium Tier 3 Supermarket Type3
22 1.637 Low Fat 0.160 Household 14.018 OUT035 4016016.000 Small Tier 2 Supermarket Type1
23 1.598 Low Fat 0.240 Baking Goods 10.378 OUT019 3940225.000 Small Tier 1 Grocery Store
24 1.621 Regular 0.161 Frozen Foods 12.846 OUT046 3988009.000 Small Tier 1 Supermarket Type1
25 1.602 Low Fat 0.316 Household 6.775 OUT017 4028049.000 Medium Tier 2 Supermarket Type1
26 1.426 Regular 0.258 Snack Foods 6.505 OUT035 4016016.000 Small Tier 2 Supermarket Type1
27 1.567 low fat 0.139 Hard Drinks 6.254 OUT013 3948169.000 High Tier 3 Supermarket Type1
28 1.334 Regular 0.402 Dairy 6.746 OUT010 3992004.000 Medium Tier 3 Grocery Store
29 1.598 Regular 0.269 Canned 6.606 OUT019 3940225.000 Small Tier 1 Grocery Store
... ... ... ... ... ... ... ... ... ... ...
8493 1.416 Regular 0.161 Snack Foods 13.755 OUT017 4028049.000 Medium Tier 2 Supermarket Type1
8494 1.650 Low Fat 0.000 Household 10.511 OUT017 4028049.000 Medium Tier 2 Supermarket Type1
8495 1.507 Low Fat 0.172 Snack Foods 12.669 OUT035 4016016.000 Small Tier 2 Supermarket Type1
8496 1.416 Regular 0.147 Seafood 13.615 OUT017 4028049.000 Medium Tier 2 Supermarket Type1
8497 1.617 Low Fat 0.222 Fruits and Vegetables 12.248 OUT035 4016016.000 Small Tier 2 Supermarket Type1
8498 1.653 Low Fat 0.188 Household 11.161 OUT018 4036081.000 Medium Tier 3 Supermarket Type2
8499 1.566 Low Fat 0.194 Health and Hygiene 10.000 OUT035 4016016.000 Small Tier 2 Supermarket Type1
8500 1.736 Low Fat 0.000 Household 11.188 OUT017 4028049.000 Medium Tier 2 Supermarket Type1
8501 1.294 Regular 0.077 Snack Foods 10.022 OUT017 4028049.000 Medium Tier 2 Supermarket Type1
8502 1.460 Low Fat 0.266 Household 14.711 OUT045 4008004.000 Medium Tier 2 Supermarket Type1
8503 1.738 Low Fat 0.190 Fruits and Vegetables 10.962 OUT035 4016016.000 Small Tier 2 Supermarket Type1
8504 1.598 Low Fat 0.352 Household 10.571 OUT027 3940225.000 Medium Tier 3 Supermarket Type3
8505 1.533 Regular 0.307 Snack Foods 14.520 OUT018 4036081.000 Medium Tier 3 Supermarket Type2
8506 1.688 Low Fat 0.291 Soft Drinks 16.223 OUT018 4036081.000 Medium Tier 3 Supermarket Type2
8507 1.331 Regular 0.174 Frozen Foods 10.090 OUT035 4016016.000 Small Tier 2 Supermarket Type1
8508 1.559 Regular 0.208 Fruits and Vegetables 14.124 OUT045 4008004.000 Medium Tier 2 Supermarket Type1
8509 1.446 Low Fat 0.443 Fruits and Vegetables 14.628 OUT010 3992004.000 Medium Tier 3 Grocery Store
8510 1.620 Regular 0.238 Snack Foods 15.218 OUT035 4016016.000 Small Tier 2 Supermarket Type1
8511 1.692 Low Fat 0.164 Frozen Foods 16.205 OUT018 4036081.000 Medium Tier 3 Supermarket Type2
8512 1.741 Low Fat 0.207 Dairy 13.353 OUT013 3948169.000 High Tier 3 Supermarket Type1
8513 1.576 Regular 0.143 Meat 9.995 OUT035 4016016.000 Small Tier 2 Supermarket Type1
8514 1.646 Regular 0.233 Canned 7.589 OUT045 4008004.000 Medium Tier 2 Supermarket Type1
8515 1.741 Low Fat 0.147 Baking Goods 12.551 OUT018 4036081.000 Medium Tier 3 Supermarket Type2
8516 1.710 Low Fat 0.344 Others 7.665 OUT018 4036081.000 Medium Tier 3 Supermarket Type2
8517 1.741 reg 0.289 Frozen Foods 13.373 OUT046 3988009.000 Small Tier 1 Supermarket Type1
8518 1.388 Low Fat 0.238 Snack Foods 14.647 OUT013 3948169.000 High Tier 3 Supermarket Type1
8519 1.458 Regular 0.217 Baking Goods 10.400 OUT045 4008004.000 Medium Tier 2 Supermarket Type1
8520 1.537 Low Fat 0.188 Health and Hygiene 9.226 OUT035 4016016.000 Small Tier 2 Supermarket Type1
8521 1.406 Regular 0.381 Snack Foods 10.155 OUT018 4036081.000 Medium Tier 3 Supermarket Type2
8522 1.642 Low Fat 0.212 Soft Drinks 8.687 OUT046 3988009.000 Small Tier 1 Supermarket Type1

8523 rows × 10 columns

TREATMENT OF DATA

1. As number of values in the each column of train dataset is too high, it causes a danger of "Curse of Dimensionality" which in turn impacts "Accuracy Level" henceforth we should use "Label Encoding" in columns which have many columns and one hot encoding or get_dummies in columns which have less number of columns.

2. Also one-hot encoding is performed on nominal data and label-encoding is performed on ordinal data.

3. One-Hot Encoding or get_dummies for X

4. After concating all the dummy variables with the original dataframe, we need to drop the column from which dummy variable has been created and also one dummy variable column because "Multicollinearity" condition" is created,whichinduces "Dummy Variable Trap" which can crash computers and also affect the accuracy to an undesirable extent.

In [1512]:
x_dummies=pd.get_dummies(X[["Item_Type","Outlet_Identifier"]]) #Use of get_dummies instead of One-Hot Encoding
x_dummies.head()
le=LabelEncoder()
X["Outlet_Size"]=le.fit_transform(X["Outlet_Size"])
X["Outlet_Location_Type"]=le.fit_transform(X["Outlet_Location_Type"])
X["Outlet_Type"]=le.fit_transform(X["Outlet_Type"])
X["Item_Fat_Content"]=le.fit_transform(X["Item_Fat_Content"])
x_dummies=pd.get_dummies(X)
x_dummies.head()
Out[1512]:
Item_Weight Item_Fat_Content Item_Visibility Item_MRP Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Type_Baking Goods Item_Type_Breads ... Outlet_Identifier_OUT010 Outlet_Identifier_OUT013 Outlet_Identifier_OUT017 Outlet_Identifier_OUT018 Outlet_Identifier_OUT019 Outlet_Identifier_OUT027 Outlet_Identifier_OUT035 Outlet_Identifier_OUT045 Outlet_Identifier_OUT046 Outlet_Identifier_OUT049
0 9.300 1 0.016 249.809 1999.000 1 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 1
1 5.920 2 0.019 48.269 2009.000 1 2 2 0 0 ... 0 0 0 1 0 0 0 0 0 0
2 17.500 1 0.017 141.618 1999.000 1 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 1
3 19.200 2 0.000 182.095 1998.000 1 2 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
4 8.930 1 0.000 53.861 1987.000 0 2 1 0 0 ... 0 1 0 0 0 0 0 0 0 0

5 rows × 34 columns

In [1511]:
x_dummies.skew()
Out[1511]:
Item_Weight                        0.091
Item_Fat_Content                   0.995
Item_Visibility                    0.814
Item_MRP                           0.127
Outlet_Establishment_Year         -0.397
Outlet_Size                       -0.087
Outlet_Location_Type              -0.209
Outlet_Type                        0.927
Item_Type_Baking Goods             3.200
Item_Type_Breads                   5.568
Item_Type_Breakfast                8.633
Item_Type_Canned                   3.197
Item_Type_Dairy                    3.096
Item_Type_Frozen Foods             2.659
Item_Type_Fruits and Vegetables    2.022
Item_Type_Hard Drinks              6.072
Item_Type_Health and Hygiene       3.669
Item_Type_Household                2.547
Item_Type_Meat                     4.137
Item_Type_Others                   6.890
Item_Type_Seafood                 11.412
Item_Type_Snack Foods              2.066
Item_Type_Soft Drinks              4.027
Item_Type_Starchy Foods            7.391
Outlet_Identifier_OUT010           3.526
Outlet_Identifier_OUT013           2.504
Outlet_Identifier_OUT017           2.516
Outlet_Identifier_OUT018           2.512
Outlet_Identifier_OUT019           3.635
Outlet_Identifier_OUT027           2.498
Outlet_Identifier_OUT035           2.508
Outlet_Identifier_OUT045           2.510
Outlet_Identifier_OUT046           2.508
Outlet_Identifier_OUT049           2.508
dtype: float64

Analysis of Data

Univariate Analysis

In [1516]:
# Lets check if the Item_Weight and Item_Visibility is normally distributed
# Lets first check overall Item_Weight and Item_Visibility
# Item_Weight distribution
from matplotlib import pyplot as plt
plt.hist(x_dummies['Item_Weight'])
plt.xlabel('Item_Weight')
plt.ylabel('Number of Items')
plt.show()
plt.hist(x_dummies['Item_Visibility'])
plt.xlabel('Item_Visibility')
plt.ylabel('Number of Items')
plt.show()
plt.hist(x_dummies['Item_MRP'])
plt.xlabel('Item_MRP')
plt.ylabel('Number of Items')
plt.show()
plt.hist(x_dummies['Outlet_Establishment_Year'])
plt.xlabel('Outlet_Establishment_Year')
plt.ylabel('Number of Items')
plt.show()

None of the data in the numerical columns or feature columns is normally distributed. Thus, to make this data normally distributed, we have to apply standard scaler normalization.

Bivariate Analysis

Numerical column vs numerical column :

Analysis that can be performed :

  • Relationship analysis : Scatter plot
  • Correlation analysis : Correlation matrix and plot
  • Regression analysis : Simple linear regression (SLR)

Relationship analysis- Scatter Plot

In [1148]:
plt.scatter(x_dummies["Item_Weight"],x_dummies["Item_Visibility"],color='violet')
plt.xlabel("Item_Weight")
plt.ylabel("Item_Visibility")
plt.title("Item_Weight Vs Item_Visibility")
plt.show()
In [1149]:
plt.scatter(x_dummies["Item_MRP"],x_dummies["Outlet_Establishment_Year"],color='orange')
plt.xlabel("Item_MRP")
plt.ylabel("Outlet_Establishment_Year")
plt.title("Item_MRP Vs Outlet_Establishment_Year")
plt.show()

Correlation Plot- Correlation Matrix and Plot

In [1151]:
matrix=x_dummies.corr()
matrix
Out[1151]:
Item_Weight Item_Fat_Content Item_Visibility Item_MRP Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Identifier_DRA12 Item_Identifier_DRA24 ... Outlet_Identifier_OUT010 Outlet_Identifier_OUT013 Outlet_Identifier_OUT017 Outlet_Identifier_OUT018 Outlet_Identifier_OUT019 Outlet_Identifier_OUT027 Outlet_Identifier_OUT035 Outlet_Identifier_OUT045 Outlet_Identifier_OUT046 Outlet_Identifier_OUT049
Item_Weight 1.000 -0.017 -0.012 0.025 -0.008 -0.007 0.004 -0.001 -0.008 0.031 ... 0.003 0.012 -0.003 0.001 0.000 0.000 -0.002 -0.017 0.001 0.005
Item_Fat_Content -0.017 1.000 0.037 -0.001 -0.000 -0.012 -0.004 -0.004 -0.022 0.028 ... -0.005 0.007 0.009 -0.000 0.003 -0.006 -0.011 -0.001 -0.007 0.012
Item_Visibility -0.012 0.037 1.000 -0.001 -0.064 0.062 -0.024 -0.158 -0.018 -0.010 ... 0.167 -0.038 -0.028 -0.031 0.186 -0.048 -0.029 -0.035 -0.035 -0.032
Item_MRP 0.025 -0.001 -0.001 1.000 0.005 0.006 0.000 -0.002 0.000 0.011 ... -0.001 0.002 -0.009 0.004 -0.005 -0.007 0.012 -0.000 0.006 -0.004
Outlet_Establishment_Year -0.008 -0.000 -0.064 0.005 1.000 0.193 -0.089 -0.122 0.011 -0.010 ... 0.005 -0.453 0.382 0.466 -0.394 -0.538 0.258 0.174 -0.035 0.049
Outlet_Size -0.007 -0.012 0.062 0.006 0.193 1.000 -0.614 -0.201 -0.008 -0.001 ... -0.075 -0.683 -0.099 -0.099 0.355 -0.100 0.483 -0.100 0.483 -0.100
Outlet_Location_Type 0.004 -0.004 -0.024 0.000 -0.089 -0.614 1.000 0.467 0.013 0.001 ... 0.288 0.382 -0.048 0.382 -0.352 0.383 -0.049 -0.049 -0.479 -0.479
Outlet_Type -0.001 -0.004 -0.158 -0.002 -0.122 -0.201 0.467 1.000 -0.007 -0.007 ... -0.398 -0.089 -0.088 0.351 -0.388 0.793 -0.088 -0.088 -0.088 -0.088
Item_Identifier_DRA12 -0.008 -0.022 -0.018 0.000 0.011 -0.008 0.013 -0.007 1.000 -0.001 ... 0.011 0.005 0.005 0.005 -0.007 -0.009 0.005 0.005 -0.009 -0.009
Item_Identifier_DRA24 0.031 0.028 -0.010 0.011 -0.010 -0.001 0.001 -0.007 -0.001 1.000 ... 0.009 0.003 0.003 -0.010 0.010 0.003 0.003 -0.010 -0.010 0.003
Item_Identifier_DRA59 -0.025 0.030 0.040 0.022 -0.007 -0.002 0.000 -0.003 -0.001 -0.001 ... 0.007 0.002 0.002 0.002 0.008 0.002 -0.011 -0.011 0.002 0.002
Item_Identifier_DRB01 -0.016 -0.011 0.006 0.015 -0.017 -0.016 0.005 0.011 -0.000 -0.001 ... -0.005 0.013 -0.007 -0.007 -0.005 0.013 -0.007 -0.007 -0.007 0.013
Item_Identifier_DRB13 -0.039 0.024 -0.028 0.019 0.003 -0.007 0.003 -0.012 -0.001 -0.001 ... 0.013 0.007 0.007 -0.008 -0.006 -0.009 0.007 -0.008 -0.008 0.007
Item_Identifier_DRB24 -0.021 -0.012 -0.020 0.005 0.013 0.003 -0.010 -0.005 -0.001 -0.001 ... -0.006 -0.008 0.010 -0.008 -0.006 -0.008 0.010 0.010 -0.008 0.010
Item_Identifier_DRB25 -0.003 -0.015 0.007 -0.015 0.002 -0.008 0.018 0.004 -0.001 -0.001 ... 0.011 0.005 0.005 0.005 -0.007 0.005 0.005 -0.009 -0.009 -0.009
Item_Identifier_DRB48 0.023 0.028 -0.024 -0.047 -0.004 -0.001 0.001 -0.002 -0.001 -0.001 ... 0.009 0.003 0.003 -0.010 -0.007 0.003 0.003 -0.010 0.003 0.003
Item_Identifier_DRC01 -0.036 0.026 -0.025 -0.039 0.000 -0.008 0.007 0.010 -0.001 -0.001 ... -0.007 0.005 0.005 0.005 -0.007 0.005 -0.009 0.005 0.005 -0.009
Item_Identifier_DRC12 0.026 -0.012 -0.012 0.017 0.010 0.012 -0.016 -0.005 -0.001 -0.001 ... -0.006 -0.008 0.010 -0.008 -0.006 -0.008 0.010 -0.008 0.010 0.010
Item_Identifier_DRC13 -0.024 0.021 -0.018 -0.006 0.020 0.003 0.004 0.001 -0.001 -0.001 ... -0.006 -0.008 0.010 0.010 -0.006 -0.008 0.010 0.010 -0.008 -0.008
Item_Identifier_DRC24 0.018 -0.009 -0.013 0.003 0.019 -0.004 0.007 0.006 -0.000 -0.000 ... -0.004 -0.005 0.019 0.019 -0.004 -0.005 -0.005 -0.005 -0.005 -0.005
Item_Identifier_DRC25 -0.035 -0.016 -0.010 -0.025 -0.011 -0.001 0.006 -0.002 -0.001 -0.001 ... 0.009 0.003 -0.010 0.003 0.010 0.003 -0.010 0.003 0.003 -0.010
Item_Identifier_DRC27 0.006 -0.015 -0.000 0.045 0.017 -0.000 0.002 -0.007 -0.001 -0.001 ... 0.011 -0.009 0.005 0.005 -0.007 -0.009 0.005 0.005 -0.009 0.005
Item_Identifier_DRC36 0.001 0.039 -0.015 0.013 -0.005 0.001 0.003 0.006 -0.001 -0.001 ... -0.006 0.007 0.007 -0.008 -0.006 0.007 0.007 -0.008 0.007 -0.008
Item_Identifier_DRC49 -0.012 -0.020 0.000 0.001 -0.006 0.015 -0.003 0.011 -0.000 -0.001 ... -0.005 -0.007 -0.007 -0.007 -0.005 0.013 0.013 -0.007 0.013 -0.007
Item_Identifier_DRD01 -0.004 0.024 -0.002 -0.033 0.018 0.001 -0.003 -0.000 -0.001 -0.001 ... -0.006 -0.008 0.007 0.007 -0.006 -0.009 0.007 0.007 -0.008 0.007
Item_Identifier_DRD12 -0.034 -0.023 0.001 -0.023 0.003 -0.001 0.006 0.008 -0.001 -0.001 ... -0.008 0.003 0.003 0.003 -0.007 0.003 0.003 0.003 0.003 -0.010
Item_Identifier_DRD13 0.012 -0.016 -0.009 -0.035 0.004 -0.008 0.006 0.008 -0.001 -0.001 ... -0.008 0.003 0.003 0.003 -0.007 0.003 0.003 0.003 -0.010 0.003
Item_Identifier_DRD15 -0.013 -0.016 0.002 0.042 -0.006 0.005 -0.009 -0.018 -0.001 -0.001 ... 0.009 0.003 -0.010 -0.010 0.010 -0.010 0.003 0.003 0.003 0.003
Item_Identifier_DRD24 0.006 -0.016 -0.019 0.000 -0.001 -0.008 0.006 0.003 -0.001 -0.001 ... 0.009 0.003 0.003 0.003 -0.007 0.003 -0.010 -0.010 0.003 0.003
Item_Identifier_DRD25 -0.037 -0.024 0.011 -0.014 -0.002 0.010 -0.004 -0.003 -0.001 -0.001 ... 0.007 -0.011 -0.011 0.002 0.008 0.002 0.002 0.002 0.002 0.002
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Item_Identifier_NCZ41 0.040 -0.014 -0.007 -0.006 0.011 -0.007 0.009 -0.000 -0.001 -0.001 ... -0.006 0.007 0.007 0.007 -0.006 -0.009 0.007 0.007 -0.008 -0.008
Item_Identifier_NCZ42 -0.014 -0.021 -0.028 0.037 0.003 -0.007 0.003 -0.012 -0.001 -0.001 ... 0.013 0.007 0.007 -0.008 -0.006 -0.009 0.007 -0.008 -0.008 0.007
Item_Identifier_NCZ53 -0.015 -0.014 -0.019 0.018 -0.006 -0.015 0.015 0.006 -0.001 -0.001 ... 0.013 0.007 -0.008 0.007 -0.006 0.007 -0.008 -0.008 -0.008 0.007
Item_Identifier_NCZ54 0.009 -0.016 0.009 0.010 -0.005 0.005 -0.004 -0.002 -0.001 -0.001 ... 0.009 -0.010 -0.010 0.003 0.010 0.003 -0.010 0.003 0.003 0.003
Item_Type_Baking Goods -0.033 0.070 0.017 -0.067 -0.004 0.002 -0.003 -0.005 -0.008 -0.008 ... -0.000 0.003 0.004 -0.004 0.005 -0.003 -0.004 -0.001 0.008 -0.007
Item_Type_Breads -0.051 0.030 0.001 -0.000 -0.004 0.004 0.004 0.005 -0.005 -0.005 ... 0.002 -0.005 -0.012 -0.001 0.001 0.008 0.004 0.013 -0.003 -0.005
Item_Type_Breakfast -0.002 0.049 0.043 0.001 -0.007 -0.003 -0.001 -0.009 -0.003 -0.003 ... 0.008 0.003 0.000 0.000 0.014 -0.004 -0.007 -0.007 -0.007 0.003
Item_Type_Canned -0.031 0.064 0.013 -0.006 0.011 0.010 -0.000 0.010 -0.008 -0.008 ... -0.013 -0.008 -0.002 0.010 -0.004 0.001 0.012 0.005 0.002 -0.005
Item_Type_Dairy 0.033 0.016 0.037 0.036 -0.005 0.001 -0.010 -0.012 -0.008 -0.008 ... -0.002 0.008 -0.000 -0.002 0.012 -0.011 -0.005 -0.007 0.005 0.005
Item_Type_Frozen Foods 0.001 0.069 -0.003 -0.013 0.008 0.007 -0.009 -0.003 -0.009 -0.010 ... -0.003 -0.002 0.016 -0.002 -0.007 -0.006 -0.002 -0.015 0.018 -0.001
Item_Type_Fruits and Vegetables 0.030 0.093 0.018 0.024 -0.005 -0.014 0.008 0.006 -0.011 -0.012 ... -0.002 0.008 -0.007 0.001 -0.005 0.005 -0.006 0.009 -0.009 0.004
Item_Type_Hard Drinks -0.047 -0.085 -0.003 -0.010 0.007 -0.012 0.002 0.001 -0.004 -0.005 ... 0.006 -0.001 -0.003 -0.003 -0.016 -0.001 -0.003 0.011 -0.008 0.016
Item_Type_Health and Hygiene 0.014 -0.153 -0.054 -0.042 -0.003 -0.011 0.012 0.004 -0.007 -0.007 ... 0.006 0.006 0.007 0.002 -0.005 0.005 -0.011 -0.003 -0.001 -0.007
Item_Type_Household 0.036 -0.211 -0.034 0.047 -0.002 -0.001 0.005 -0.004 -0.009 -0.010 ... 0.012 0.004 -0.005 -0.005 -0.007 -0.001 0.003 -0.000 0.005 -0.005
Item_Type_Meat -0.002 0.087 -0.018 -0.004 -0.013 0.005 0.004 0.004 -0.006 -0.007 ... 0.014 -0.009 -0.004 -0.000 0.013 0.016 -0.006 -0.014 -0.004 0.001
Item_Type_Others 0.027 -0.091 -0.015 -0.019 -0.003 0.014 -0.015 -0.012 -0.004 -0.004 ... -0.003 -0.007 -0.006 0.004 0.023 -0.010 -0.007 0.004 0.010 -0.004
Item_Type_Seafood -0.005 0.003 0.013 0.001 -0.001 0.009 -0.005 -0.003 -0.002 -0.002 ... -0.001 -0.009 -0.009 0.000 0.011 -0.000 0.004 0.009 -0.004 0.000
Item_Type_Snack Foods 0.010 0.058 0.006 0.034 0.003 0.003 -0.002 0.008 -0.011 -0.012 ... -0.010 -0.007 -0.003 0.001 0.001 0.006 0.010 0.002 -0.012 0.009
Item_Type_Soft Drinks -0.047 -0.086 -0.010 -0.036 0.007 0.001 -0.001 -0.005 0.113 0.122 ... -0.002 0.001 0.010 -0.004 -0.003 -0.006 0.006 0.004 -0.001 -0.004
Item_Type_Starchy Foods 0.023 0.026 0.004 0.015 0.012 -0.009 0.007 0.003 -0.004 -0.004 ... -0.002 0.008 0.006 0.003 -0.019 -0.006 0.008 -0.000 -0.003 0.002
Outlet_Identifier_OUT010 0.003 -0.005 0.167 -0.001 0.005 -0.075 0.288 -0.398 0.011 0.009 ... 1.000 -0.092 -0.092 -0.092 -0.068 -0.093 -0.092 -0.092 -0.092 -0.092
Outlet_Identifier_OUT013 0.012 0.007 -0.038 0.002 -0.453 -0.683 0.382 -0.089 0.005 0.003 ... -0.092 1.000 -0.122 -0.122 -0.090 -0.123 -0.123 -0.123 -0.123 -0.123
Outlet_Identifier_OUT017 -0.003 0.009 -0.028 -0.009 0.382 -0.099 -0.048 -0.088 0.005 0.003 ... -0.092 -0.122 1.000 -0.122 -0.090 -0.123 -0.122 -0.122 -0.122 -0.122
Outlet_Identifier_OUT018 0.001 -0.000 -0.031 0.004 0.466 -0.099 0.382 0.351 0.005 -0.010 ... -0.092 -0.122 -0.122 1.000 -0.090 -0.123 -0.122 -0.122 -0.122 -0.122
Outlet_Identifier_OUT019 0.000 0.003 0.186 -0.005 -0.394 0.355 -0.352 -0.388 -0.007 0.010 ... -0.068 -0.090 -0.090 -0.090 1.000 -0.090 -0.090 -0.090 -0.090 -0.090
Outlet_Identifier_OUT027 0.000 -0.006 -0.048 -0.007 -0.538 -0.100 0.383 0.793 -0.009 0.003 ... -0.093 -0.123 -0.123 -0.123 -0.090 1.000 -0.123 -0.123 -0.123 -0.123
Outlet_Identifier_OUT035 -0.002 -0.011 -0.029 0.012 0.258 0.483 -0.049 -0.088 0.005 0.003 ... -0.092 -0.123 -0.122 -0.122 -0.090 -0.123 1.000 -0.122 -0.122 -0.122
Outlet_Identifier_OUT045 -0.017 -0.001 -0.035 -0.000 0.174 -0.100 -0.049 -0.088 0.005 -0.010 ... -0.092 -0.123 -0.122 -0.122 -0.090 -0.123 -0.122 1.000 -0.122 -0.122
Outlet_Identifier_OUT046 0.001 -0.007 -0.035 0.006 -0.035 0.483 -0.479 -0.088 -0.009 -0.010 ... -0.092 -0.123 -0.122 -0.122 -0.090 -0.123 -0.122 -0.122 1.000 -0.122
Outlet_Identifier_OUT049 0.005 0.012 -0.032 -0.004 0.049 -0.100 -0.479 -0.088 -0.009 0.003 ... -0.092 -0.123 -0.122 -0.122 -0.090 -0.123 -0.122 -0.122 -0.122 1.000

1593 rows × 1593 columns

In [1152]:
plt.rcParams['figure.figsize'] = [10,10]
from matplotlib import pyplot as plt
plt.matshow(matrix,cmap='Blues')
plt.show()

"""Stripplot"""

In [1194]:
import seaborn as sns
sns.stripplot(x_dummies['Item_Weight'],x_dummies['Item_MRP'],jitter=0.3,size=4)
plt.show()
x_dummies['Item_Weight'] = np.log(x_dummies['Item_Weight'])
x_dummies['Item_Weight'] = np.sqrt(x_dummies['Item_Weight'])
x_dummies['Item_Visibility'] = np.sqrt(x_dummies['Item_Visibility'])
x_dummies['Item_MRP'] = np.sqrt(x_dummies['Item_MRP'])
x_dummies['Outlet_Establishment_Year'] = np.square(x_dummies['Outlet_Establishment_Year'])
x_dummies['Item_Outlet_Sales'] = np.sqrt(x_dummies['Item_Outlet_Sales'])
train.skew()
train.shape
In [1193]:
sns.pairplot(train)
plt.show()

Categorical Column Vs Numerical Column

In [1192]:
import seaborn as sns
from matplotlib import pyplot as plt
g = sns.FacetGrid(train,col="Outlet_Size")
g = g.map(plt.hist,"Outlet_Establishment_Year")
plt.show()

import seaborn as sns
from matplotlib import pyplot as plt
g = sns.FacetGrid(train,col="Item_Fat_Content")
g = g.map(plt.hist,"Item_Weight")
plt.show()

import seaborn as sns
from matplotlib import pyplot as plt
g = sns.FacetGrid(train,col="Outlet_Size")
g = g.map(plt.hist,"Outlet_Establishment_Year")
plt.show()

Normalization Techniques

Use MinMaxScaler as the default if you are transforming a feature. Its non-disturbing.

You could use RobustScaler if you have outliers and want to reduce their influence.However, you might be better off removing the outliers, instead.

Use standardScaler if you need a relatively normal distribution.

Use Normalizer sparingly-if normalizes sample rows, not feature columns. It can use l2 or l1 normalization.

To know more about the distribution of data , we will do univariate analysis and bivariate analysis of data with plots such as Histogram.

In [1167]:
# Min-Max Scaler , Normalizing X"""
from sklearn.preprocessing import StandardScaler
sc=MinMaxScaler()
sc_data=sc.fit_transform(x_dummies)
sc_data=pd.DataFrame(sc_data)
sc_data.columns=x_dummies.columns
sc_data.head()
Out[1167]:
Item_Weight Item_Fat_Content Item_Visibility Item_MRP Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Identifier_DRA12 Item_Identifier_DRA24 ... Outlet_Identifier_OUT010 Outlet_Identifier_OUT013 Outlet_Identifier_OUT017 Outlet_Identifier_OUT018 Outlet_Identifier_OUT019 Outlet_Identifier_OUT027 Outlet_Identifier_OUT035 Outlet_Identifier_OUT045 Outlet_Identifier_OUT046 Outlet_Identifier_OUT049
0 0.283 0.250 0.082 0.928 0.583 0.500 0.000 0.333 0.000 0.000 ... 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 1.000
1 0.081 0.500 0.098 0.072 1.000 0.500 1.000 0.667 0.000 0.000 ... 0.000 0.000 0.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
2 0.771 0.250 0.086 0.468 0.583 0.500 0.000 0.333 0.000 0.000 ... 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 1.000
3 0.872 0.500 0.000 0.640 0.542 0.500 1.000 0.000 0.000 0.000 ... 1.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
4 0.260 0.250 0.000 0.096 0.083 0.000 1.000 0.333 0.000 0.000 ... 0.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000

5 rows × 1593 columns

In [1168]:
Y.head()
Out[1168]:
0   3735.138
1    443.423
2   2097.270
3    732.380
4    994.705
Name: Item_Outlet_Sales, dtype: float64

Linear Regression

In [1471]:
"""Importing libraries and splitting the data"""
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
In [1526]:
x_train,x_test,y_train,y_test=train_test_split(x_dummies,Y,test_size=0.2,random_state=0)
reg = LinearRegression()
model1=reg.fit(x_train,y_train)
In [1518]:
len(x_train)
Out[1518]:
6818
In [1519]:
len(y_train)
Out[1519]:
6818
In [1527]:
"""Predicting Y_pred using x_test"""
Y_pred = reg.predict(x_test)
Y_pred
len(Y_pred)
Out[1527]:
1705
In [1528]:
from sklearn.metrics import r2_score, mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test,Y_pred))
rmse
Out[1528]:
10.820120556480678
In [1529]:
r2= r2_score(y_test,Y_pred) # Built-in function score() indicates R-Square value
print(r2*100)
pd.set_option("display.float_format",lambda x: "%.3f" % x)
65.13566107532749
In [1206]:
"""Plotting Residual Plots"""
x_axis = [i for i in range(1,len(Y_pred)+1)]
x_plot = plt.scatter(x_axis,(y_test-Y_pred),c='green')
plt.plot(x_axis,[0]*len(Y_pred),c='yellow')
plt.title('Residual Plot')
Out[1206]:
Text(0.5, 1.0, 'Residual Plot')
In [1563]:
x_dummies.head()
Out[1563]:
Item_Weight Item_Fat_Content Item_Visibility Item_MRP Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Type_Baking Goods Item_Type_Breads ... Outlet_Identifier_OUT010 Outlet_Identifier_OUT013 Outlet_Identifier_OUT017 Outlet_Identifier_OUT018 Outlet_Identifier_OUT019 Outlet_Identifier_OUT027 Outlet_Identifier_OUT035 Outlet_Identifier_OUT045 Outlet_Identifier_OUT046 Outlet_Identifier_OUT049
0 9.300 1 0.016 249.809 1999.000 1 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 1
1 5.920 2 0.019 48.269 2009.000 1 2 2 0 0 ... 0 0 0 1 0 0 0 0 0 0
2 17.500 1 0.017 141.618 1999.000 1 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 1
3 19.200 2 0.000 182.095 1998.000 1 2 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
4 8.930 1 0.000 53.861 1987.000 0 2 1 0 0 ... 0 1 0 0 0 0 0 0 0 0

5 rows × 34 columns

Lasso-Ridge Regularization

In [1564]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.linear_model import Ridge, Lasso
In [1565]:
x_train,x_test,y_train,y_test=train_test_split(x_dummies,Y,test_size=0.3,random_state=0)
print(len(x_train),len(y_train))
5966 5966
In [1566]:
reg=LinearRegression()
model=reg.fit(x_train,y_train)
In [1533]:
model.coef_
Out[1533]:
array([-1.61927358e-02,  2.08852555e-01, -2.31791860e+00,  1.60958476e-01,
        2.61166764e-01, -4.02426738e+00, -4.13072360e+00,  1.11451836e+01,
        3.31332590e-01, -2.19645473e-01, -4.99802005e-01,  8.62238635e-01,
       -1.38895050e+00, -4.79410589e-01, -2.43730094e-02, -3.46050271e-04,
       -6.30132151e-02, -2.85007083e-01,  3.86482538e-01,  2.52467303e-01,
        1.63561987e+00,  2.91401442e-01, -7.50120133e-01, -4.88743219e-02,
       -7.97206304e+00,  6.87752890e+00,  2.49894163e+00, -8.82900687e+00,
       -9.00439298e+00,  1.49886724e+00,  7.77422198e+00,  2.44546033e+00,
        4.08343253e+00,  6.27010283e-01])
In [1567]:
model.intercept_
Out[1567]:
-506.3373249477129
In [1568]:
y_pred=model.predict(x_test)
In [1569]:
from sklearn.metrics import r2_score, mean_squared_error
rmse=np.sqrt(mean_squared_error(y_test,y_pred))
r2=r2_score(y_test,y_pred)
print("RMSE=",rmse)
print("R2 score=",r2)
RMSE= 10.822043164075689
R2 score= 0.6523377799908223
In [1570]:
#residual plot
x=[i for i in range(1,len(y_pred)+1)]
x_plot=plt.scatter(x,(y_pred - y_test),c='orange')
plt.plot(x,[0]*len(y_pred),c="blue")
plt.title('Residual plot')
Out[1570]:
Text(0.5, 1.0, 'Residual plot')

Regularization Techniques

In [1537]:
predictors=x_train.columns
predictors
Out[1537]:
Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_MRP',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Type_Baking Goods', 'Item_Type_Breads',
       'Item_Type_Breakfast', 'Item_Type_Canned', 'Item_Type_Dairy',
       'Item_Type_Frozen Foods', 'Item_Type_Fruits and Vegetables',
       'Item_Type_Hard Drinks', 'Item_Type_Health and Hygiene',
       'Item_Type_Household', 'Item_Type_Meat', 'Item_Type_Others',
       'Item_Type_Seafood', 'Item_Type_Snack Foods', 'Item_Type_Soft Drinks',
       'Item_Type_Starchy Foods', 'Outlet_Identifier_OUT010',
       'Outlet_Identifier_OUT013', 'Outlet_Identifier_OUT017',
       'Outlet_Identifier_OUT018', 'Outlet_Identifier_OUT019',
       'Outlet_Identifier_OUT027', 'Outlet_Identifier_OUT035',
       'Outlet_Identifier_OUT045', 'Outlet_Identifier_OUT046',
       'Outlet_Identifier_OUT049'],
      dtype='object')
In [1538]:
coef=pd.Series(reg.coef_, predictors).sort_values()
coef
Out[1538]:
Outlet_Identifier_OUT019          -9.004
Outlet_Identifier_OUT018          -8.829
Outlet_Identifier_OUT010          -7.972
Outlet_Location_Type              -4.131
Outlet_Size                       -4.024
Item_Visibility                   -2.318
Item_Type_Dairy                   -1.389
Item_Type_Soft Drinks             -0.750
Item_Type_Breakfast               -0.500
Item_Type_Frozen Foods            -0.479
Item_Type_Household               -0.285
Item_Type_Breads                  -0.220
Item_Type_Health and Hygiene      -0.063
Item_Type_Starchy Foods           -0.049
Item_Type_Fruits and Vegetables   -0.024
Item_Weight                       -0.016
Item_Type_Hard Drinks             -0.000
Item_MRP                           0.161
Item_Fat_Content                   0.209
Item_Type_Others                   0.252
Outlet_Establishment_Year          0.261
Item_Type_Snack Foods              0.291
Item_Type_Baking Goods             0.331
Item_Type_Meat                     0.386
Outlet_Identifier_OUT049           0.627
Item_Type_Canned                   0.862
Outlet_Identifier_OUT027           1.499
Item_Type_Seafood                  1.636
Outlet_Identifier_OUT045           2.445
Outlet_Identifier_OUT017           2.499
Outlet_Identifier_OUT046           4.083
Outlet_Identifier_OUT013           6.878
Outlet_Identifier_OUT035           7.774
Outlet_Type                       11.145
dtype: float64
In [1539]:
coef.plot(kind="bar",title="Modal Coefficients")
Out[1539]:
<matplotlib.axes._subplots.AxesSubplot at 0x2a18ade2828>

Ridge Regression

L2 Regularization

In [1540]:
"""
alpha is hyperparameter of Ridge, which means that they are not automatically learned by the model instead they have to be set 
manually.
"""

#alpha_range=[0.01,0.05,0.5,5]

ridgeReg=Ridge(alpha=0.01, normalize=True)
model=ridgeReg.fit(x_train,y_train)
pred=model.predict(x_test)
rmse=np.sqrt(mean_squared_error(y_test, pred))
r2=r2_score(y_test, pred)
print("RMSE=", rmse, "R2=", r2)
RMSE= 10.828437790653059 R2= 0.6519267990213873
In [1541]:
df=pd.DataFrame(columns=['Linear Reg', "Ridge Reg"])
for i in range(len(ridgeReg.coef_)):
    df=df.append({'Linear Reg': reg.coef_[i], "Ridge Reg":ridgeReg.coef_[i]},ignore_index=True)
df.head()
Out[1541]:
Linear Reg Ridge Reg
0 -0.016 -0.016
1 0.209 0.207
2 -2.318 -2.666
3 0.161 0.159
4 0.261 0.054
In [1495]:
predictors=x_train.columns
coef=pd.Series(ridgeReg.coef_,predictors).sort_values()
coef.plot(kind='bar',title='Model Coefficients')
print(coef)
"""It tells about what which independent variables hold much significance"""
Outlet_Identifier_OUT010          -18.742
Outlet_Identifier_OUT019          -17.996
Outlet_Identifier_OUT018           -4.788
Item_Visibility                    -2.666
Item_Type_Dairy                    -1.286
Outlet_Size                        -1.254
Item_Type_Soft Drinks              -0.677
Outlet_Location_Type               -0.513
Item_Type_Breakfast                -0.417
Item_Type_Frozen Foods             -0.394
Item_Type_Household                -0.191
Item_Type_Breads                   -0.132
Item_Weight                        -0.016
Item_Type_Health and Hygiene        0.001
Item_Type_Starchy Foods             0.042
Outlet_Establishment_Year           0.054
Item_Type_Fruits and Vegetables     0.058
Item_Type_Hard Drinks               0.083
Item_MRP                            0.159
Item_Fat_Content                    0.207
Item_Type_Others                    0.294
Item_Type_Snack Foods               0.375
Item_Type_Baking Goods              0.384
Item_Type_Meat                      0.454
Item_Type_Canned                    0.938
Item_Type_Seafood                   1.702
Outlet_Identifier_OUT045            2.259
Outlet_Identifier_OUT013            2.735
Outlet_Identifier_OUT017            3.332
Outlet_Identifier_OUT049            3.424
Outlet_Identifier_OUT046            3.697
Outlet_Type                         4.807
Outlet_Identifier_OUT035            5.205
Outlet_Identifier_OUT027            6.812
dtype: float64
Out[1495]:
'It tells about what which independent variables hold much significance'

Lasso Regression

1) L1 Regularization

In [1542]:
#alpha_range=[0.01,0.05,0.5,5]

lassoReg=Lasso(alpha=0.01, normalize=True)
model=lassoReg.fit(x_train,y_train)
pred=model.predict(x_test)
rmse=np.sqrt(mean_squared_error(y_test, pred))
r2=r2_score(y_test, pred)
print("RMSE=",rmse)
print("R-square=",r2)
co=lassoReg.coef_
inte=lassoReg.intercept_
RMSE= 11.046124088561202
R-square= 0.6377913557369557
In [1543]:
df=pd.DataFrame(columns=['Linear Reg',"Ridge Reg", 'Lasso Reg'])

for i in range(len(ridgeReg.coef_)):
    df=df.append({'Linear Reg': reg.coef_[i],"Ridge Reg":ridgeReg.coef_[i],"Lasso Reg":lassoReg.coef_[i]},ignore_index=True)
df.head()
Out[1543]:
Linear Reg Ridge Reg Lasso Reg
0 -0.016 -0.016 -0.000
1 0.209 0.207 0.000
2 -2.318 -2.666 -0.000
3 0.161 0.159 0.149
4 0.261 0.054 -0.000
In [1544]:
print("RMSE=",rmse,"R2=",r2)
coef=pd.Series(lassoReg.coef_,predictors).sort_values()
print(coef)
coef.plot(kind="bar",title="Model Coefficients")
RMSE= 11.046124088561202 R2= 0.6377913557369557
Outlet_Identifier_OUT019          -19.224
Outlet_Identifier_OUT010          -19.175
Outlet_Identifier_OUT018           -5.708
Item_Weight                        -0.000
Outlet_Identifier_OUT045           -0.000
Outlet_Identifier_OUT035            0.000
Outlet_Identifier_OUT027            0.000
Outlet_Identifier_OUT017            0.000
Outlet_Identifier_OUT013            0.000
Item_Type_Starchy Foods             0.000
Item_Type_Soft Drinks              -0.000
Item_Type_Snack Foods               0.000
Item_Type_Seafood                   0.000
Item_Type_Others                   -0.000
Item_Type_Meat                      0.000
Item_Type_Household                -0.000
Item_Type_Health and Hygiene       -0.000
Item_Type_Hard Drinks               0.000
Item_Fat_Content                    0.000
Item_Visibility                    -0.000
Outlet_Establishment_Year          -0.000
Outlet_Size                        -0.000
Outlet_Location_Type               -0.000
Outlet_Identifier_OUT046            0.000
Outlet_Identifier_OUT049            0.000
Item_Type_Breads                    0.000
Item_Type_Breakfast                -0.000
Item_Type_Canned                    0.000
Item_Type_Dairy                    -0.000
Item_Type_Frozen Foods             -0.000
Item_Type_Fruits and Vegetables     0.000
Item_Type_Baking Goods              0.000
Item_MRP                            0.149
Outlet_Type                         5.117
dtype: float64
Out[1544]:
<matplotlib.axes._subplots.AxesSubplot at 0x2a1b78201d0>

Analysis of Data

  • (Before Regularization) Previous Model1=> RMSE=10.8201 , R2=65.13
  • (Simple Regularization) Previous Model2(Using Alpha)=>RMSE=10.822043164075689 R2 score= 0.6523377799908223
  • RMSE= 10.828437790653059 R2= 0.6519267990213873[L2 Regularization]
  • RMSE= 11.000884418122189 R2= 0.6396102429778721[L1 Regularization]
  • Clearly observing, L2 regularisation is better than the L2 regularisation and previous models because its accuracy is the maximum
  • Regularization has a significant impact but it fails to increase the value of the R2 square, therefore it is not important to put Regularization on the data Independent Variables:

Item_Identifier,Item_Weight,Item_Fat_Content Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales. "The impact on outlet sales are caused by the positive values of coef---":

Item_Weight 2.232 Item_Type_Soft Drinks 6.674 Item_Type_Meat 8.607 Item_Type_Breads 14.587 Item_MRP 14.724 Item_Fat_Content 33.438 Item_Type_Fruits and Vegetables 56.863 Item_Type_Others 65.276 Item_Type_Starchy Foods 70.085 Outlet_Identifier_OUT045 97.045 Outlet_Identifier_OUT013 139.639 Outlet_Identifier_OUT046 220.273 Outlet_Identifier_OUT049 245.044 Outlet_Identifier_OUT017 258.164 Item_Type_Seafood 304.190 Outlet_Type 389.815 Outlet_Identifier_OUT035 393.441 Outlet_Identifier_OUT027 707.095

"Item_Weight,Item_MRP,Outlet_Identifier, Item_Type, Outlet_Type, Item_Fat_Content"- Independent Variables which cause significant impact on the sales of outlet.

Analysis of Test Data

In [1545]:
test=pd.read_csv("D:/Assignment1_PGD B7/Test.csv")
test.head()
Out[1545]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type
0 FDW58 20.750 Low Fat 0.008 Snack Foods 107.862 OUT049 1999 Medium Tier 1 Supermarket Type1
1 FDW14 8.300 reg 0.038 Dairy 87.320 OUT017 2007 NaN Tier 2 Supermarket Type1
2 NCN55 14.600 Low Fat 0.100 Others 241.754 OUT010 1998 NaN Tier 3 Grocery Store
3 FDQ58 7.315 Low Fat 0.015 Snack Foods 155.034 OUT017 2007 NaN Tier 2 Supermarket Type1
4 FDY38 nan Regular 0.119 Dairy 234.230 OUT027 1985 Medium Tier 3 Supermarket Type3

Treatment of Test.csv

Imputing of Missing Values

In [1546]:
test.isnull().sum()
Out[1546]:
Item_Identifier                 0
Item_Weight                   976
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64
In [1553]:
me=np.mean(test["Item_Weight"])
test["Item_Weight"].fillna(me,inplace=True)
test["Outlet_Size"].mode()
test["Outlet_Size"].fillna("Medium",inplace=True)
test["Outlet_Size"].head()
test.head()
test.shape
test.isnull().sum()
Out[1553]:
Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
dtype: int64

Converting Categorical Values into Numerical Values

1. Also one-hot encoding is performed on nominal data and label-encoding is performed on ordinal data.

2. One-Hot Encoding or get_dummies for X

In [1554]:
'''Using Get_Dummies Function for nominal Data and Label Encoding 
for Ordinal Data'''
import pandas as pd
test1=test.drop("Item_Identifier",axis=1)
test1.head()
Out[1554]:
Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type
0 20.750 Low Fat 0.008 Snack Foods 107.862 OUT049 1999 Medium Tier 1 Supermarket Type1
1 8.300 reg 0.038 Dairy 87.320 OUT017 2007 Medium Tier 2 Supermarket Type1
2 14.600 Low Fat 0.100 Others 241.754 OUT010 1998 Medium Tier 3 Grocery Store
3 7.315 Low Fat 0.015 Snack Foods 155.034 OUT017 2007 Medium Tier 2 Supermarket Type1
4 12.696 Regular 0.119 Dairy 234.230 OUT027 1985 Medium Tier 3 Supermarket Type3
In [1555]:
test_dummies=pd.get_dummies(test1[["Item_Type","Outlet_Identifier"]]) #Use of get_dummies instead of One-Hot Encoding
test_dummies.head()
Out[1555]:
Item_Type_Baking Goods Item_Type_Breads Item_Type_Breakfast Item_Type_Canned Item_Type_Dairy Item_Type_Frozen Foods Item_Type_Fruits and Vegetables Item_Type_Hard Drinks Item_Type_Health and Hygiene Item_Type_Household ... Outlet_Identifier_OUT010 Outlet_Identifier_OUT013 Outlet_Identifier_OUT017 Outlet_Identifier_OUT018 Outlet_Identifier_OUT019 Outlet_Identifier_OUT027 Outlet_Identifier_OUT035 Outlet_Identifier_OUT045 Outlet_Identifier_OUT046 Outlet_Identifier_OUT049
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
1 0 0 0 0 1 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0
4 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0

5 rows × 26 columns

In [1556]:
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
le=LabelEncoder()
test1["Outlet_Size"]=le.fit_transform(test1["Outlet_Size"])
test1["Outlet_Location_Type"]=le.fit_transform(test1["Outlet_Location_Type"])
test1["Outlet_Type"]=le.fit_transform(test1["Outlet_Type"])
test1["Item_Fat_Content"]=le.fit_transform(test1["Item_Fat_Content"])
test_dummies=pd.get_dummies(test1)
test_dummies.head()
test_dummies.shape
Y1=Y[0:5681]
In [1557]:
x_train,x_test,y_test,y_train=train_test_split(test_dummies,Y1,test_size=0.2,random_state=0)
In [1558]:
len(y_train)
predicted_values=model.predict(test_dummies)
print(predicted_values)
"""Predicted Values are:"""
[39.92448467 36.87369798 35.51712395 ... 41.54055926 55.77953405
 35.75632567]
Out[1558]:
'Predicted Values are:'
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: